First, let’s look at the TE breakpoint analysis

Dvir_breakpoints <- read.csv('/Users/jullienflynn/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/DAIBAM/Dvir_breakpoint_TEs_forR.csv', header=T)
Dvir_breakpoints
##     Ref_species Inversion    Type base_pairs
## 1          Dvir    In2b_d  DAIBAM          0
## 2          Dvir    In2b_d     DNA          0
## 3          Dvir    In2b_d     LTR          0
## 4          Dvir    In2b_d    LINE          0
## 5          Dvir    In2b_d Unknown          0
## 6          Dvir    In2b_d   Other          0
## 7          Dvir    In2b_d      RC          0
## 8          Dvir    In2b_p  DAIBAM          0
## 9          Dvir    In2b_p     DNA          0
## 10         Dvir    In2b_p     LTR        649
## 11         Dvir    In2b_p    LINE          0
## 12         Dvir    In2b_p Unknown        323
## 13         Dvir    In2b_p   Other          0
## 14         Dvir    In2b_p      RC       3817
## 15         Dvir    In2c_d  DAIBAM          0
## 16         Dvir    In2c_d     DNA          0
## 17         Dvir    In2c_d     LTR        136
## 18         Dvir    In2c_d    LINE        129
## 19         Dvir    In2c_d Unknown        111
## 20         Dvir    In2c_d   Other          0
## 21         Dvir    In2c_d      RC        112
## 22         Dvir    In2c_p  DAIBAM          0
## 23         Dvir    In2c_p     DNA          0
## 24         Dvir    In2c_p     LTR          0
## 25         Dvir    In2c_p    LINE          0
## 26         Dvir    In2c_p Unknown         88
## 27         Dvir    In2c_p   Other          0
## 28         Dvir    In2c_p      RC          0
## 29         Dvir    In2a_d  DAIBAM        738
## 30         Dvir    In2a_d     DNA          0
## 31         Dvir    In2a_d     LTR          0
## 32         Dvir    In2a_d    LINE          0
## 33         Dvir    In2a_d Unknown        506
## 34         Dvir    In2a_d   Other          0
## 35         Dvir    In2a_d      RC          0
## 36         Dvir    In2a_p  DAIBAM       1249
## 37         Dvir    In2a_p     DNA        455
## 38         Dvir    In2a_p     LTR        297
## 39         Dvir    In2a_p    LINE         52
## 40         Dvir    In2a_p Unknown        691
## 41         Dvir    In2a_p   Other          0
## 42         Dvir    In2a_p      RC       1833
## 43         Dvir    In4a_d  DAIBAM          0
## 44         Dvir    In4a_d     DNA          0
## 45         Dvir    In4a_d     LTR          0
## 46         Dvir    In4a_d    LINE          0
## 47         Dvir    In4a_d Unknown          0
## 48         Dvir    In4a_d   Other          0
## 49         Dvir    In4a_d      RC          0
## 50         Dvir    In4a_p  DAIBAM          0
## 51         Dvir    In4a_p     DNA          0
## 52         Dvir    In4a_p     LTR          0
## 53         Dvir    In4a_p    LINE          0
## 54         Dvir    In4a_p Unknown          0
## 55         Dvir    In4a_p   Other          0
## 56         Dvir    In4a_p      RC          0
## 57         Dvir    In5b_d  DAIBAM          0
## 58         Dvir    In5b_d     DNA          0
## 59         Dvir    In5b_d     LTR         68
## 60         Dvir    In5b_d    LINE          0
## 61         Dvir    In5b_d Unknown         77
## 62         Dvir    In5b_d   Other          0
## 63         Dvir    In5b_d      RC          0
## 64         Dvir    In5b_p  DAIBAM          0
## 65         Dvir    In5b_p     DNA          0
## 66         Dvir    In5b_p     LTR          0
## 67         Dvir    In5b_p    LINE          0
## 68         Dvir    In5b_p Unknown          0
## 69         Dvir    In5b_p   Other          0
## 70         Dvir    In5b_p      RC        139
## 71         Dvir    InXc_d  DAIBAM          0
## 72         Dvir    InXc_d     DNA          0
## 73         Dvir    InXc_d     LTR          0
## 74         Dvir    InXc_d    LINE          0
## 75         Dvir    InXc_d Unknown          0
## 76         Dvir    InXc_d   Other          0
## 77         Dvir    InXc_d      RC          0
## 78         Dvir    InXc_p  DAIBAM          0
## 79         Dvir    InXc_p     DNA          0
## 80         Dvir    InXc_p     LTR        786
## 81         Dvir    InXc_p    LINE          0
## 82         Dvir    InXc_p Unknown        322
## 83         Dvir    InXc_p   Other          0
## 84         Dvir    InXc_p      RC        423
## 85         Dvir    InXb_d  DAIBAM          0
## 86         Dvir    InXb_d     DNA          0
## 87         Dvir    InXb_d     LTR          0
## 88         Dvir    InXb_d    LINE          0
## 89         Dvir    InXb_d Unknown          0
## 90         Dvir    InXb_d   Other          0
## 91         Dvir    InXb_d      RC          0
## 92         Dvir    InXb_p  DAIBAM          0
## 93         Dvir    InXb_p     DNA       1006
## 94         Dvir    InXb_p     LTR       2149
## 95         Dvir    InXb_p    LINE       2262
## 96         Dvir    InXb_p Unknown        644
## 97         Dvir    InXb_p   Other          0
## 98         Dvir    InXb_p      RC       8601
## 99         Dvir    InXa_d  DAIBAM        555
## 100        Dvir    InXa_d     DNA          0
## 101        Dvir    InXa_d     LTR          0
## 102        Dvir    InXa_d    LINE          0
## 103        Dvir    InXa_d Unknown          0
## 104        Dvir    InXa_d   Other          0
## 105        Dvir    InXa_d      RC          0
## 106        Dvir    InXa_p  DAIBAM          0
## 107        Dvir    InXa_p     DNA          0
## 108        Dvir    InXa_p     LTR          0
## 109        Dvir    InXa_p    LINE          0
## 110        Dvir    InXa_p Unknown          0
## 111        Dvir    InXa_p   Other          0
## 112        Dvir    InXa_p      RC          0
library(ggplot2)
## Warning in as.POSIXlt.POSIXct(Sys.time()): unknown timezone 'zone/tz/2021a.
## 2.0/zoneinfo/America/New_York'
ggplot() +
  geom_bar(data = Dvir_breakpoints, aes(y = base_pairs, x = Inversion, fill = factor(Type, levels = c("DAIBAM", "DNA", "LTR", "LINE", "Unknown", "Other", "RC"))), stat = "identity", position = 'fill') +
  scale_fill_manual(values = c("#FFCC00", "#33CC66", "#9966FF", "#FF33CC", "#00CCFF", "#CCCCCC", "#FF6600")) +
  scale_y_continuous(labels = scales::percent) +
  theme_bw() + 
  theme(legend.title=element_blank()) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  ylab("Percent of region") +
  theme(text = element_text(size=15))
## Warning: Removed 42 rows containing missing values (geom_bar).

Dnov_breakpoints <- read.csv('/Users/jullienflynn/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/DAIBAM/Dnov_breakpoints_TEs_forR.csv', header=T)

ggplot() +
  geom_bar(data = Dnov_breakpoints, aes(y = base_pairs, x = Inversion, fill = factor(Type, levels = c("DAIBAM", "DNA", "LTR", "LINE", "Unknown", "Other", "RC"))), stat = "identity", position = 'fill') +
  scale_fill_manual(values = c("#FFCC00", "#33CC66", "#9966FF", "#FF33CC", "#00CCFF", "#CCCCCC", "#FF6600")) +
  scale_y_continuous(labels = scales::percent) +
  theme_bw() + 
  theme(legend.title=element_blank()) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
   ylab("Percent of region") +
  theme(text = element_text(size=15))
## Warning: Removed 7 rows containing missing values (geom_bar).

Dame_breakpoints <- read.csv('/Users/jullienflynn/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/DAIBAM/Dame_breakpoints_TEs_forR.csv', header=T)

ggplot() +
  geom_bar(data = Dame_breakpoints, aes(y = base_pairs, x = Inversion, fill = factor(Type, levels = c("DAIBAM", "DNA", "LTR", "LINE", "Unknown", "Other", "RC"))), stat = "identity", position = 'fill') +
  scale_fill_manual(values = c("#FFCC00", "#33CC66", "#9966FF", "#FF33CC", "#00CCFF", "#CCCCCC", "#FF6600")) +
  scale_y_continuous(labels = scales::percent) +
  theme_bw() + 
  theme(legend.title=element_blank()) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  ylab("Percent of region") +
  theme(text = element_text(size=15))
## Warning: Removed 42 rows containing missing values (geom_bar).

Genome-wide TE annotation: pie charts

library("ggplot2")


Dvir_masking <- read.csv('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/TE_landscapes/Dvir_masking.csv', header=T)
Dvir_masking
##             Type nt_masked  total_nt perc_mask
## 1           LINE  10245012 189680791  5.401186
## 2            DNA   2823009 189680791  1.488295
## 3            LTR  21901494 189680791 11.546501
## 4 Helitron/DINEs  12751857 189680791  6.722798
## 5          Other   3053549 189680791  1.609836
## 6        Unknown   5100259 189680791  2.688864
## 7         Non-TE 133805611 189680791 70.542521
Dvir_masking$Type <- factor(Dvir_masking$Type, levels=c("DNA", "Helitron/DINEs", "LTR", "LINE", "Other", "Unknown", "Non-TE"))

ggplot (Dvir_masking, aes(x = "", y = perc_mask, fill = Type)) +
  geom_bar(width=1, stat="identity") +
  coord_polar("y", start=0) +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF", "#CCCCCC")) +
  theme_void()

# virilis old genome.
Dvir_masking <- read.csv('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/TE_landscapes/virilis_caf1.csv', header=T)
Dvir_masking
##      Type masked_nt  total_nt  perc_mask
## 1     LTR  17456713 189206400  9.2262804
## 2    LINE   9588644 189206400  5.0678222
## 3     DNA   1338553 189206400  0.7074565
## 4      RC   5670391 189206400  2.9969340
## 5 Unknown  21340230 189206400 11.2788098
## 6  Non-TE 133811869 189206400 70.7226970
Dvir_masking$Type <- factor(Dvir_masking$Type, levels=c("DNA", "RC", "LTR", "LINE", "Unknown", "Non-TE"))

ggplot (Dvir_masking, aes(x = "", y = perc_mask, fill = Type)) +
  geom_bar(width=1, stat="identity") +
  coord_polar("y", start=0) +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#00CCFF", "#CCCCCC")) +
  theme_void()

# Dnovamexicana

Dnov_masking <- read.csv('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/TE_landscapes/Dnov_masking.csv', header=T)
Dnov_masking
##             Type nt_masked  total_nt  perc_mask
## 1           LINE   6281676 182394971  3.4439963
## 2            DNA    894270 182394971  0.4902931
## 3            LTR  11169301 182394971  6.1236891
## 4 Helitron/DINEs  11289884 182394971  6.1898000
## 5          Other   7673241 182394971  4.2069367
## 6        Unknown   5411510 182394971  2.9669184
## 7         Non-TE 139675089 182394971 76.5783663
Dnov_masking$Type <- factor(Dnov_masking$Type, levels=c("DNA", "Helitron/DINEs", "LTR", "LINE", "Other", "Unknown", "Non-TE"))

ggplot (Dnov_masking, aes(x = "", y = perc_mask, fill = Type)) +
  geom_bar(width=1, stat="identity") +
  coord_polar("y", start=0) +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF", "#CCCCCC")) +
  theme_void()

# D americana

Dame_masking <- read.csv('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/TE_landscapes/Dame_masking.csv', header=T)
Dame_masking
##             Type nt_masked  total_nt  perc_mask
## 1           LINE   8062670 211471196  3.8126564
## 2            DNA   1288628 211471196  0.6093634
## 3            LTR  10142700 211471196  4.7962560
## 4 Helitron/DINEs  28521833 211471196 13.4873371
## 5          Other   9166368 211471196  4.3345705
## 6        Unknown   4845328 211471196  2.2912473
## 7         Non-TE 149443669 211471196 70.6685694
Dame_masking$Type <- factor(Dame_masking$Type, levels=c("DNA", "Helitron/DINEs", "LTR", "LINE", "Other", "Unknown", "Non-TE"))

ggplot (Dame_masking, aes(x = "", y = perc_mask, fill = Type)) +
  geom_bar(width=1, stat="identity") +
  coord_polar("y", start=0) +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF", "#CCCCCC")) +
  theme_void()

Genome-wide and Y-specific TE annotation: landscape plots

Dvir_landscape <- read.delim('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/landscape/Feb7/Apr9_dvir_mod/Dvir_genome.fa.out.landscape.Div.Rclass.tab.sed.txt', header=F, sep="\t")
types <- Dvir_landscape$V1
types
## [1] Other   LTR     DNA     DINE    LINE    RC      Unknown
## Levels: DINE DNA LINE LTR Other RC Unknown
Dvir_landscape$V1 <- NULL
rownames(Dvir_landscape) <- types
RC_DINE <- Dvir_landscape["DINE",] + Dvir_landscape["RC",]
to_remove <- c("DINE", "RC")
Dvir_landscape_edit <- Dvir_landscape[!row.names(Dvir_landscape)%in%to_remove,]
Dvir_landscape_edit2 <- rbind(Dvir_landscape_edit, RC_DINE)
Dvir_landscape_edit2
##              V2      V3      V4      V5      V6      V7      V8      V9
## Other        32      88      40     293      47     280    1798    1366
## LTR     2759332 1397409 1317267 1275702 1089392 1326759 1156997 1024066
## DNA      198823   54226  116348  393031  417977  353076  180435  177560
## LINE     888723 1825663  786903  765242  544501  474068  555860  329650
## Unknown  212580   83833  107727  106931  138050  156283  127209  171456
## DINE     145202  118866  157942  330471  626207  817256  803570  827175
##             V10     V11     V12    V13    V14    V15    V16    V17    V18
## Other      5463   29386   75226 117274 159211 298042 461193 339794 382328
## LTR      976129  745479  751488 704309 675488 720830 639797 549795 442516
## DNA      105077   91533   65248  67084  58969  58590  55899  47702  43585
## LINE     232108  164667  185111 234833 265122 192473 168230 175475 184472
## Unknown  138048  146374  196972 199331 146371 155603 148140 180721 179929
## DINE    1061538 1172542 1030390 555089 345377 323574 340841 425320 356125
##            V19    V20    V21    V22    V23    V24    V25    V26    V27
## Other   232149 357786 246011 173314  52782  41802  35676  22189   8836
## LTR     405957 402708 401342 338518 277580 297462 257488 186664 215339
## DNA      48646  40907  35617  34734  32283  23356  22798  21464  19854
## LINE    189020 201204 189857 195557 200234 174941 180720 186384 166611
## Unknown 179076 214708 215945 267813 272711 309622 219828 224705 198708
## DINE    362969 348372 334102 371618 405743 351483 291181 238222 201068
##            V28    V29    V30    V31    V32    V33    V34   V35   V36   V37
## Other     3445   4166   1609   1448     89    263    107    16     0     0
## LTR     195932 168675 192037 189576 230217 157962 122727 82590 37683 24743
## DNA      12098   8392  17669   8791   4888   4002   1468   420   398     0
## LINE    140322 109860  79331  67316  56746  57216  27478 13627 11726  9881
## Unknown 122138 103013  64690  43788  28373  20899   8992  4974  3387   653
## DINE    148273 102247  70323  40556  24305  13928   6778  2149   741   266
##           V38  V39  V40 V41
## Other       0    0    0   0
## LTR     14244 3773 2469 680
## DNA        61    0    0   0
## LINE     2904 8443 2533   0
## Unknown   588   14   76   0
## DINE        0   14    0   0
# ok, got it.

Types <- c()
windows <- c()
Proportion <- c()

for (i in 1:nrow(Dvir_landscape_edit2)) {
  Types <- c(Types, rep(rownames(Dvir_landscape_edit2)[i], times=40)) 
  windows <- c(windows, c(1:40))
  Proportion <- c(Proportion, as.numeric(Dvir_landscape_edit2[i,1:ncol(Dvir_landscape_edit2)]))
}

Dvir_landscape_df <- data.frame(Type=Types, Window=windows, Prop=Proportion)
head(Dvir_landscape_df)
##    Type Window Prop
## 1 Other      1   32
## 2 Other      2   88
## 3 Other      3   40
## 4 Other      4  293
## 5 Other      5   47
## 6 Other      6  280
# still need to divide by the appropriate number
Dvir_genome_size <- 189680791
Dvir_landscape_df$Prop <- Dvir_landscape_df$Prop/Dvir_genome_size

# remove the SINE
Dvir_landscape_df_mod <- Dvir_landscape_df[which(Dvir_landscape_df$Type!="SINE"),]

Dvir_landscape_df_mod$Type <- factor(Dvir_landscape_df_mod$Type, levels=c("DNA", "DINE", "LTR", "LINE", "Other", "Unknown"))

# stacked barplot
ggplot(Dvir_landscape_df_mod, aes(fill=Type, y=Prop, x=Window)) +
  geom_bar(position="stack", stat="identity") +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF")) +
  ylim(0,0.0225) +
  theme_bw()

#### Dvir Y chrom  ####

Dvir_Y_landscape <- read.delim('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/landscape/Feb7/Dvir_Ycontigs.fa.out.landscape.Div.Rclass.tab.sed.txt', header=F, sep="\t")
types <- Dvir_Y_landscape$V1
types
## [1] LTR     RC      LINE    Unknown DINE    DNA    
## Levels: DINE DNA LINE LTR RC Unknown
Dvir_Y_landscape$V1 <- NULL
rownames(Dvir_Y_landscape) <- types
RC_DINE <- Dvir_Y_landscape["DINE",] + Dvir_Y_landscape["RC",]
to_remove <- c("DINE", "RC")
Dvir_Y_landscape_edit <- Dvir_Y_landscape[!row.names(Dvir_Y_landscape)%in%to_remove,]
Dvir_Y_landscape_edit2 <- rbind(Dvir_Y_landscape_edit, RC_DINE)

# ok, got it.

Types <- c()
windows <- c()
Proportion <- c()

for (i in 1:nrow(Dvir_Y_landscape_edit2)) {
  Types <- c(Types, rep(rownames(Dvir_Y_landscape_edit2)[i], times=40)) 
  windows <- c(windows, c(1:40))
  Proportion <- c(Proportion, as.numeric(Dvir_Y_landscape_edit2[i,1:ncol(Dvir_Y_landscape_edit2)]))
}

Dvir_Y_landscape_df <- data.frame(Type=Types, Window=windows, Prop=Proportion)
head(Dvir_Y_landscape_df)
##   Type Window   Prop
## 1  LTR      1 387055
## 2  LTR      2 452240
## 3  LTR      3 600157
## 4  LTR      4 603886
## 5  LTR      5 542391
## 6  LTR      6 549035
# still need to divide by the appropriate number
Dvir_Y_size <- 14824018
Dvir_Y_landscape_df$Prop <- Dvir_Y_landscape_df$Prop/Dvir_Y_size

# remove the SINE
Dvir_Y_landscape_df_mod <- Dvir_Y_landscape_df[which(Dvir_Y_landscape_df$Type!="SINE"),]

Dvir_Y_landscape_df_mod$Type <- factor(Dvir_Y_landscape_df_mod$Type, levels=c("DNA", "DINE", "LTR", "LINE", "Other", "Unknown"))

# stacked barplot
ggplot(Dvir_Y_landscape_df_mod, aes(fill=Type, y=Prop, x=Window)) +
  geom_bar(position="stack", stat="identity") +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC",  "#00CCFF")) +
  ylim(0,0.125) +
  theme_bw()

######
##Dnov

Dnov_landscape <- read.delim('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/landscape/Feb7/Dnov_genome.fa.out.landscape.Div.Rclass.tab.sed.txt', header=F, sep="\t")
types <- Dnov_landscape$V1
types
## [1] LINE    Other   DNA     Unknown LTR     RC      SINE    DINE   
## Levels: DINE DNA LINE LTR Other RC SINE Unknown
Dnov_landscape$V1 <- NULL
rownames(Dnov_landscape) <- types
RC_DINE <- Dnov_landscape["DINE",] + Dnov_landscape["RC",]
to_remove <- c("DINE", "RC")
Dnov_landscape_edit <- Dnov_landscape[!row.names(Dnov_landscape)%in%to_remove,]
Dnov_landscape_edit2 <- rbind(Dnov_landscape_edit, RC_DINE)
Dnov_landscape_edit2
##              V2     V3     V4     V5     V6     V7      V8      V9    V10
## LINE     666640 729920 437873 355817 312038 301572  290409  165507 133375
## Other     29001    991    468 153232  25881  53056  238413   70350 127523
## DNA       61443  25899  53278  57070  40388  38933   28909   31969  29020
## Unknown  576183  89122 112566 160143 135381 121178   94068   95364  82662
## LTR     1370306 739714 611444 570504 596322 458092  428656  418732 438645
## SINE          0      0      0      0      0    222     146       0    440
## DINE      22016 110777 130047 193779 447657 782248 1039331 1132728 885255
##            V11    V12    V13    V14    V15    V16     V17     V18    V19
## LINE    124750 143252 122111 143739 124972 134259  163202  155796 143143
## Other   207687 100252 429010 452917 597315 847750 1794867 1412014 526405
## DNA      39984  32846  27321  24952  22282  18774   29377   40824  48038
## Unknown  90219 104636  93560 126420 154453 140755  161782  166137 192379
## LTR     389911 424319 328714 228280 262493 253967  243258  213140 230401
## SINE        74   1120      0    399    136      0     360     150     75
## DINE    731183 582840 454148 365468 348149 364206  400064  471029 270405
##            V20    V21    V22    V23    V24    V25    V26    V27    V28
## LINE    145807 148281 143347 134430 122546 127465 108247  91694  84593
## Other   336611  94241  81006  45266  11753   9658  15678   4317   2839
## DNA      37417  34165  24274  22080  21838  22812  17225  14084  10571
## Unknown 210071 239042 266531 286149 282942 324164 262746 236653 180718
## LTR     239281 211439 254110 314319 187124 158578 149229 156721 151432
## SINE         0      0      0      0    114      0      0      0      0
## DINE    318400 284890 294826 282726 285688 288269 222889 189121 148098
##            V29    V30    V31    V32    V33    V34   V35   V36   V37   V38
## LINE     69338  65137  76447  79895  68733  59970 42895 20355 26152 10466
## Other     1463   1200    783    688    198    127     0   135   146     0
## DNA      10617   8748   5287   2489   3991   1175  1703  1848   304     0
## Unknown 150874 107046  72175  41392  24904  11675  9240  4321   932  1869
## LTR     150721 174365 206094 174096 127394 123091 98294 48947 23265  9356
## SINE         0      0      0      0      0      0     0     0     0     0
## DINE    103318  67281  33170  21661  10181   5969  1119   579   369     0
##          V39  V40 V41
## LINE    5211 2221  27
## Other      0    0   0
## DNA        0    0   0
## Unknown  652  406   0
## LTR     3335  771   0
## SINE       0    0   0
## DINE       0    0   0
# ok, got it.

Types <- c()
windows <- c()
Proportion <- c()

for (i in 1:nrow(Dnov_landscape_edit2)) {
  Types <- c(Types, rep(rownames(Dnov_landscape_edit2)[i], times=40)) 
  windows <- c(windows, c(1:40))
  Proportion <- c(Proportion, as.numeric(Dnov_landscape_edit2[i,1:ncol(Dnov_landscape_edit2)]))
}

Dnov_landscape_df <- data.frame(Type=Types, Window=windows, Prop=Proportion)

# still need to divide by the appropriate number
Dnov_genome_size <- 182394971
Dnov_landscape_df$Prop <- Dnov_landscape_df$Prop/Dnov_genome_size

# remove the SINE
Dnov_landscape_df_mod <- Dnov_landscape_df[which(Dnov_landscape_df$Type!="SINE"),]

Dnov_landscape_df_mod$Type <- factor(Dnov_landscape_df_mod$Type, levels=c("DNA", "DINE", "LTR", "LINE", "Other", "Unknown"))

# stacked barplot
ggplot(Dnov_landscape_df_mod, aes(fill=Type, y=Prop, x=Window)) +
  geom_bar(position="stack", stat="identity") +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF")) +
  ylim(0,0.0225) +
  theme_bw()

#### Dnov Y chrom ###

Dnov_Y_landscape <- read.delim('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/landscape/Feb7/Dnov_Ycontigs.fa.out.landscape.Div.Rclass.tab.sed.txt', header=F, sep="\t")
types <- Dnov_Y_landscape$V1

Dnov_Y_landscape$V1 <- NULL
rownames(Dnov_Y_landscape) <- types
RC_DINE <- Dnov_Y_landscape["DINE",] + Dnov_Y_landscape["RC",]
to_remove <- c("DINE", "RC")
Dnov_Y_landscape_edit <- Dnov_Y_landscape[!row.names(Dnov_Y_landscape)%in%to_remove,]
Dnov_Y_landscape_edit2 <- rbind(Dnov_Y_landscape_edit, RC_DINE)

# ok, got it.

Types <- c()
windows <- c()
Proportion <- c()

for (i in 1:nrow(Dnov_Y_landscape_edit2)) {
  Types <- c(Types, rep(rownames(Dnov_Y_landscape_edit2)[i], times=40)) 
  windows <- c(windows, c(1:40))
  Proportion <- c(Proportion, as.numeric(Dnov_Y_landscape_edit2[i,1:ncol(Dnov_Y_landscape_edit2)]))
}

Dnov_Y_landscape_df <- data.frame(Type=Types, Window=windows, Prop=Proportion)

# still need to divide by the appropriate number
Dnov_Y_size <- 8975053
Dnov_Y_landscape_df$Prop <- Dnov_Y_landscape_df$Prop/Dnov_Y_size

# remove the SINE
Dnov_Y_landscape_df_mod <- Dnov_Y_landscape_df[which(Dnov_Y_landscape_df$Type!="SINE"),]

Dnov_Y_landscape_df_mod$Type <- factor(Dnov_Y_landscape_df_mod$Type, levels=c("DNA", "DINE", "LTR", "LINE", "Other", "Unknown"))

# stacked barplot
ggplot(Dnov_Y_landscape_df_mod, aes(fill=Type, y=Prop, x=Window)) +
  geom_bar(position="stack", stat="identity") +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF")) +
  ylim(0,0.125) +
  theme_bw()

### Now Dame ####
Dame_landscape <- read.delim('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/landscape/Feb7/Dame_genome.fa.out.landscape.Div.Rclass.tab.sed.txt', header=F, sep="\t")
types <- Dame_landscape$V1

Dame_landscape$V1 <- NULL
rownames(Dame_landscape) <- types
RC_DINE <- Dame_landscape["DINE",] + Dame_landscape["RC",]
to_remove <- c("DINE", "RC")
Dame_landscape_edit <- Dame_landscape[!row.names(Dame_landscape)%in%to_remove,]
Dame_landscape_edit2 <- rbind(Dame_landscape_edit, RC_DINE)


# ok, got it.

Types <- c()
windows <- c()
Proportion <- c()

for (i in 1:nrow(Dame_landscape_edit2)) {
  Types <- c(Types, rep(rownames(Dame_landscape_edit2)[i], times=40)) 
  windows <- c(windows, c(1:40))
  Proportion <- c(Proportion, as.numeric(Dame_landscape_edit2[i,1:ncol(Dame_landscape_edit2)]))
}

Dame_landscape_df <- data.frame(Type=Types, Window=windows, Prop=Proportion)

# still need to divide by the appropriate number
Dame_genome_size <- 211471196
Dame_landscape_df$Prop <- Dame_landscape_df$Prop/Dame_genome_size

# remove the SINE
Dame_landscape_df_mod <- Dame_landscape_df[which(Dame_landscape_df$Type!="SINE"),]

Dame_landscape_df_mod$Type <- factor(Dame_landscape_df_mod$Type, levels=c("DNA", "DINE", "LTR", "LINE", "Other", "Unknown"))

# stacked barplot
ggplot(Dame_landscape_df_mod, aes(fill=Type, y=Prop, x=Window)) +
  geom_bar(position="stack", stat="identity") +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF")) +
  ylim(0,0.0225) +
  theme_bw()

#### Dame Y ######

Dame_Y_landscape <- read.delim('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/landscape/Feb7/Dame_Ycontigs.fa.out.landscape.Div.Rclass.tab.sed.txt', header=F, sep="\t")
types <- Dame_Y_landscape$V1

Dame_Y_landscape$V1 <- NULL
rownames(Dame_Y_landscape) <- types
RC_DINE <- Dame_Y_landscape["DINE",] + Dame_Y_landscape["RC",]
to_remove <- c("DINE", "RC")
Dame_Y_landscape_edit <- Dame_Y_landscape[!row.names(Dame_Y_landscape)%in%to_remove,]
Dame_Y_landscape_edit2 <- rbind(Dame_Y_landscape_edit, RC_DINE)

# ok, got it.

Types <- c()
windows <- c()
Proportion <- c()

for (i in 1:nrow(Dame_Y_landscape_edit2)) {
  Types <- c(Types, rep(rownames(Dame_Y_landscape_edit2)[i], times=40)) 
  windows <- c(windows, c(1:40))
  Proportion <- c(Proportion, as.numeric(Dame_Y_landscape_edit2[i,1:ncol(Dame_Y_landscape_edit2)]))
}

Dame_Y_landscape_df <- data.frame(Type=Types, Window=windows, Prop=Proportion)

# still need to divide by the appropriate number
Dame_Y_size <- 24076955
Dame_Y_landscape_df$Prop <- Dame_Y_landscape_df$Prop/Dame_Y_size

# remove the SINE
Dame_Y_landscape_df_mod <- Dame_Y_landscape_df[which(Dame_Y_landscape_df$Type!="SINE"),]

Dame_Y_landscape_df_mod$Type <- factor(Dame_Y_landscape_df_mod$Type, levels=c("DNA", "DINE", "LTR", "LINE", "Other", "Unknown"))

# stacked barplot
ggplot(Dame_Y_landscape_df_mod, aes(fill=Type, y=Prop, x=Window)) +
  geom_bar(position="stack", stat="identity") +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF")) +
  ylim(0,0.125) +
  theme_bw()

Now, look specifically at LINE differences

Dvir_LINEs <- read.delim('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/landscape/LINEs/Dvir_landscape.LINE.Rfam.txt', header=F, sep="\t")

Dvir_LINEs$V1 <- NULL

Types <- c()
windows <- c()
Proportion <- c()

for (i in 1:nrow(Dvir_LINEs)) {
  Types <- c(Types, rep(as.character(Dvir_LINEs[i,1]), times=40)) 
  windows <- c(windows, c(1:40))
  Proportion <- c(Proportion, as.numeric(Dvir_LINEs[i,2:ncol(Dvir_LINEs)]))
}

Dvir_LINEs_df <- data.frame(Type=Types, Window=windows, Prop=Proportion)
head(Dvir_LINEs_df)
##       Type Window   Prop
## 1 I-Jockey      1 103793
## 2 I-Jockey      2 456773
## 3 I-Jockey      3 148221
## 4 I-Jockey      4 400683
## 5 I-Jockey      5 249661
## 6 I-Jockey      6 141288
# still need to divide by the appropriate number
Dvir_genome_size <- 189680791
Dvir_LINEs_df$Prop <- Dvir_LINEs_df$Prop/Dvir_genome_size

# remove the SINE
Dvir_LINEs_df_mod <- Dvir_LINEs_df[which(Dvir_LINEs_df$Type!="LINE" & Dvir_LINEs_df$Type!="L2" & Dvir_LINEs_df$Type!="R1-LOA" & Dvir_LINEs_df$Type!="I"),]

Dvir_LINEs_df_mod$Type <- factor(Dvir_LINEs_df_mod$Type, levels=c("CR1", "I-Jockey", "Penelope", "R1"))

# stacked barplot
ggplot(Dvir_LINEs_df_mod, aes(fill=Type, y=Prop, x=Window)) +
  geom_bar(position="stack", stat="identity") +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#FFCC00", "#9966FF")) +
  #ylim(0,0.037) +
  theme_bw()

## now Dnov

Dnov_LINEs <- read.delim('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/landscape/LINEs/Dnov_landscape.LINE.Rfam.txt', header=F, sep="\t")

Dnov_LINEs$V1 <- NULL

Types <- c()
windows <- c()
Proportion <- c()

for (i in 1:nrow(Dnov_LINEs)) {
  Types <- c(Types, rep(as.character(Dnov_LINEs[i,1]), times=40)) 
  windows <- c(windows, c(1:40))
  Proportion <- c(Proportion, as.numeric(Dnov_LINEs[i,2:ncol(Dnov_LINEs)]))
}

Dnov_LINEs_df <- data.frame(Type=Types, Window=windows, Prop=Proportion)
head(Dnov_LINEs_df)
##       Type Window   Prop
## 1 I-Jockey      1 460641
## 2 I-Jockey      2 252512
## 3 I-Jockey      3  64731
## 4 I-Jockey      4  20796
## 5 I-Jockey      5  28060
## 6 I-Jockey      6  39746
# still need to divide by the appropriate number
Dnov_genome_size <- 182394971
Dnov_LINEs_df$Prop <- Dnov_LINEs_df$Prop/Dnov_genome_size


# remove the SINE
Dnov_LINEs_df_mod <- Dnov_LINEs_df[which(Dnov_LINEs_df$Type!="LINE" & Dnov_LINEs_df$Type!="L2" & Dnov_LINEs_df$Type!="R1-LOA" & Dnov_LINEs_df$Type!="I"),]

Dnov_LINEs_df_mod$Type <- factor(Dnov_LINEs_df_mod$Type, levels=c("CR1", "I-Jockey", "Penelope", "R1"))

# stacked barplot
ggplot(Dnov_LINEs_df_mod, aes(fill=Type, y=Prop, x=Window)) +
  geom_bar(position="stack", stat="identity") +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#FFCC00", "#9966FF")) +
  ylim(0,0.01) +
  theme_bw()

Now, Y chromosome only annotation: pie charts

Dame_Y <- read.csv('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/Ychrom/Dame_Y_masking.csv', header=T)
Dame_Y
##             Type nt_masked total_nt    perc_mask
## 1           LINE   1906166 24076955  7.916972890
## 2            DNA     40256 24076955  0.167197222
## 3            LTR   3899143 24076955 16.194502170
## 4 Helitron/DINEs  16691516 24076955 69.325693390
## 5          Other       694 24076955  0.002882424
## 6        Unknown    126786 24076955  0.526586522
## 7         Non-TE   1412394 24076955  5.866165385
Dame_Y$Type <- factor(Dame_Y$Type, levels=c("DNA", "Helitron/DINEs", "LTR", "LINE", "Other", "Unknown", "Non-TE"))

ggplot (Dame_Y, aes(x = "", y = perc_mask, fill = Type)) +
  geom_bar(width=1, stat="identity") +
  coord_polar("y", start=0) +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF", "#CCCCCC")) +
  theme_void()

### everything but Y
Dame_nonY <- read.csv('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/Ychrom/Dame_allbutY_masking.csv', header=T)
Dame_nonY
##             Type nt_masked  total_nt  perc_mask
## 1           LINE   6153409 187394241  3.2836703
## 2            DNA   1249870 187394241  0.6669735
## 3            LTR   6211930 187394241  3.3148991
## 4 Helitron/DINEs  11828129 187394241  6.3118957
## 5          Other   9154695 187394241  4.8852595
## 6        Unknown   4683844 187394241  2.4994599
## 7         Non-TE 148112364 187394241 79.0378419
Dame_nonY$Type <- factor(Dame_nonY$Type, levels=c("DNA", "Helitron/DINEs", "LTR", "LINE", "Other", "Unknown", "Non-TE"))

ggplot (Dame_nonY, aes(x = "", y = perc_mask, fill = Type)) +
  geom_bar(width=1, stat="identity") +
  coord_polar("y", start=0) +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF", "#CCCCCC")) +
  theme_void()

Dnov_Y <- read.csv('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/Ychrom/Dnov_Y_masking.csv', header=T)
Dnov_Y
##             Type nt_masked total_nt  perc_mask
## 1           LINE   1721083  8975053 19.1762990
## 2            DNA     19007  8975053  0.2117759
## 3            LTR   3680934  8975053 41.0129500
## 4 Helitron/DINEs   2203574  8975053 24.5522116
## 5          Other     34497  8975053  0.3843654
## 6        Unknown    244302  8975053  2.7220118
## 7         Non-TE   1071656  8975053 11.9403863
Dnov_Y$Type <- factor(Dame_Y$Type, levels=c("DNA", "Helitron/DINEs", "LTR", "LINE", "Other", "Unknown", "Non-TE"))

ggplot (Dnov_Y, aes(x = "", y = perc_mask, fill = Type)) +
  geom_bar(width=1, stat="identity") +
  coord_polar("y", start=0) +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF", "#CCCCCC")) +
  theme_void()

### everything but Y

Dnov_nonY <- read.csv('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/Ychrom/Dnov_allbutY_masking.csv', header=T)

Dnov_nonY$Type <- factor(Dnov_nonY$Type, levels=c("DNA", "Helitron/DINEs", "LTR", "LINE", "Other", "Unknown", "Non-TE"))

ggplot (Dnov_nonY, aes(x = "", y = perc_mask, fill = Type)) +
  geom_bar(width=1, stat="identity") +
  coord_polar("y", start=0) +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF", "#CCCCCC")) +
  theme_void()

Dvir_Y <- read.csv('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/Ychrom/Dvir_Y_masking.csv', header=T)
Dvir_Y
##             Type nt_masked total_nt perc_mask
## 1           LINE   1895813 14824018 12.788793
## 2            DNA    435272 14824018  2.936262
## 3            LTR   6596624 14824018 44.499568
## 4 Helitron/DINEs   4344957 14824018 29.310252
## 5          Other         0 14824018  0.000000
## 6        Unknown    323572 14824018  2.182755
## 7         Non-TE   1227780 14824018  8.282370
Dvir_Y$Type <- factor(Dvir_Y$Type, levels=c("DNA", "Helitron/DINEs", "LTR", "LINE", "Other", "Unknown", "Non-TE"))

ggplot (Dvir_Y, aes(x = "", y = perc_mask, fill = Type)) +
  geom_bar(width=1, stat="identity") +
  coord_polar("y", start=0) +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF", "#CCCCCC")) +
  theme_void()

### everything but Y

Dvir_nonY <- read.csv('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/Ychrom/Dvir_allbutY_masking.csv', header=T)
Dvir_nonY
##             Type nt_masked  total_nt perc_mask
## 1           LINE   8349199 174856773  4.774879
## 2            DNA   2387737 174856773  1.365539
## 3            LTR  15162711 174856773  8.671503
## 4 Helitron/DINEs   8406865 174856773  4.807858
## 5          Other   3053549 174856773  1.746314
## 6        Unknown   4776687 174856773  2.731771
## 7         Non-TE 132720025 174856773 75.902136
Dvir_nonY$Type <- factor(Dvir_nonY$Type, levels=c("DNA", "Helitron/DINEs", "LTR", "LINE", "Other", "Unknown", "Non-TE"))

ggplot (Dvir_nonY, aes(x = "", y = perc_mask, fill = Type)) +
  geom_bar(width=1, stat="identity") +
  coord_polar("y", start=0) +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF", "#CCCCCC")) +
  theme_void()

Now, chromosome-level TE density in 100 kb windows

Dvir_interval_sizes <- read.delim(file='~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/piRNA_sequencing/Spring2020/Muller/Dvir_100kb.intervals.sizes', header=F, sep="\t")

Dvir_TEdens <- read.delim(file='~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/piRNA_sequencing/Spring2020/Muller/Dvir_100kb.intervals_TE.density', header=F, sep="\t", as.is = T)

#Dvir_interval_sizes <- read.delim(file='~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/piRNA_sequencing/Spring2020/Muller/Dvir_intervals.sizes', header=F, sep="\t")

#Dvir_TEdens <- read.delim(file='~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/piRNA_sequencing/Spring2020/Muller/Dvir_intervals_TE.density', header=F, sep="\t", as.is = T)

Dvir_interval_sizes <- read.delim(file='~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/piRNA_sequencing/Spring2020/Muller/Dame_100kb.intervals.sizes.redo', header=F, sep="\t")

Dvir_TEdens <- read.delim(file='~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/piRNA_sequencing/Spring2020/Muller/Dame_100kb.intervals_TE.density.redo', header=F, sep="\t", as.is = T)

Dvir_interval_sizes <- read.delim(file='~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/piRNA_sequencing/Spring2020/Muller/Dnov_100kb.intervals.sizes', header=F, sep="\t")

Dvir_TEdens <- read.delim(file='~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/piRNA_sequencing/Spring2020/Muller/Dnov_100kb.intervals_TE.density', header=F, sep="\t", as.is = T)


sizes <- as.numeric(Dvir_interval_sizes$V2)
cluster_labels <- Dvir_TEdens$V1
Dvir_TEdens$V1 <- NULL
rownames(Dvir_TEdens) <- cluster_labels
colnames(Dvir_TEdens) <- c("LTR", "RC", "LINE", "DNA", "Unknown")
head(Dvir_TEdens)
##                      LTR   RC LINE  DNA Unknown   NA
## Chr_2-0-100000      1698  133 5594  303   20020  986
## Chr_2-100000-200000  843    0 9506  115     123 1326
## Chr_2-200000-300000 1019  305 3140    0   30728 1558
## Chr_2-300000-400000  358  623 2487    0     460 1965
## Chr_2-400000-500000  328 1397 2766   65   14207 3720
## Chr_2-500000-600000  566  447 2093 1121      42 2305
Dvir_TEdens_mx <- data.matrix(Dvir_TEdens)

Dvir_TEdens_norm <- Dvir_TEdens_mx/sizes
nrow(Dvir_TEdens_norm)
## [1] 1565
length(sizes) # need to rerun to get the correct size
## [1] 1565
head(Dvir_TEdens_norm)
##                         LTR      RC    LINE     DNA Unknown    <NA>
## Chr_2-0-100000      0.01698 0.00133 0.05594 0.00303 0.20020 0.00986
## Chr_2-100000-200000 0.00843 0.00000 0.09506 0.00115 0.00123 0.01326
## Chr_2-200000-300000 0.01019 0.00305 0.03140 0.00000 0.30728 0.01558
## Chr_2-300000-400000 0.00358 0.00623 0.02487 0.00000 0.00460 0.01965
## Chr_2-400000-500000 0.00328 0.01397 0.02766 0.00065 0.14207 0.03720
## Chr_2-500000-600000 0.00566 0.00447 0.02093 0.01121 0.00042 0.02305
# now need to put into a different format for plotting

Types <- c()
clusters <- c()
Proportion <- c()
IDs <- c()



for (i in 1:nrow(Dvir_TEdens_norm)) {
  Types <- c(Types, "LTR", "LINE", "RC", "DNA", "Other", "Unknown")
  clusters <- c(clusters, rep(rownames(Dvir_TEdens_norm)[i], times=6))
  Proportion <- c(Proportion, as.numeric(Dvir_TEdens_norm[i,1:ncol(Dvir_TEdens_norm)]))
  IDs <- c(IDs, rep (i, times=6))
}


#Dvir_TEdens_df <- data.frame(Clusters=clusters, Type=Types, Prop=Proportion, Chr=chrom)
Dvir_TEdens_df <- data.frame(Clusters=clusters, Type=Types, Prop=Proportion, ID=IDs)
head(Dvir_TEdens_df)
##         Clusters    Type    Prop ID
## 1 Chr_2-0-100000     LTR 0.01698  1
## 2 Chr_2-0-100000    LINE 0.00133  1
## 3 Chr_2-0-100000      RC 0.05594  1
## 4 Chr_2-0-100000     DNA 0.00303  1
## 5 Chr_2-0-100000   Other 0.20020  1
## 6 Chr_2-0-100000 Unknown 0.00986  1
temp1 <- strsplit(as.character(Dvir_TEdens_df$Clusters), "-")
Dvir_TEdens_df$Chr <- sapply(temp1, function(x) x[1])

Dvir_TEdens_df$Type <- factor(Dvir_TEdens_df$Type, levels=c("DNA", "RC", "LTR", "LINE", "Other", "Unknown"))

# make a plot for each Chr like piRNA clusters.
Dvir_TEdens_df_Chr2 <- subset(Dvir_TEdens_df, Chr == "Chr_2")
head(Dvir_TEdens_df_Chr2)
##         Clusters    Type    Prop ID   Chr
## 1 Chr_2-0-100000     LTR 0.01698  1 Chr_2
## 2 Chr_2-0-100000    LINE 0.00133  1 Chr_2
## 3 Chr_2-0-100000      RC 0.05594  1 Chr_2
## 4 Chr_2-0-100000     DNA 0.00303  1 Chr_2
## 5 Chr_2-0-100000   Other 0.20020  1 Chr_2
## 6 Chr_2-0-100000 Unknown 0.00986  1 Chr_2
# keep working on this.

library(ggplot2)

# x should be ID, not cluster. the clusters will be sorted wrong by R.

ggplot(Dvir_TEdens_df_Chr2, aes(x = ID, y = Prop)) +
  geom_line(aes(color = Type), alpha = 0.7, size = 0.3) +
  scale_color_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC","#FFCC00", "#00CCFF")) +
  ylim(0,1) +
  theme_bw() +
  theme(axis.text.x=element_blank())

ggplot() +
  geom_bar(data = Dvir_TEdens_df_Chr2, mapping = aes(fill=Type, y=Prop, x=ID), position="stack", stat="identity") +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC","#FFCC00", "#00CCFF")) +
  ylim(0,1.1) +
  theme_bw() +
  theme(axis.text.x=element_blank())

Dvir_TEdens_df_Chr3 <- subset(Dvir_TEdens_df, Chr == "Chr_3") 

tail(Dvir_TEdens_df_Chr3)
##                     Clusters    Type        Prop  ID   Chr
## 4045 Chr_3-27200000-27274493     LTR 0.008671956 675 Chr_3
## 4046 Chr_3-27200000-27274493    LINE 0.031278107 675 Chr_3
## 4047 Chr_3-27200000-27274493      RC 0.045628448 675 Chr_3
## 4048 Chr_3-27200000-27274493     DNA 0.001463225 675 Chr_3
## 4049 Chr_3-27200000-27274493   Other 0.000000000 675 Chr_3
## 4050 Chr_3-27200000-27274493 Unknown 0.036379257 675 Chr_3
# i think there's an issue here - fixed it!
# Chr_3-28200000-28253658 9251    2153    44795   2308    0       0

ggplot(Dvir_TEdens_df_Chr3, aes(x = ID, y = Prop)) +
  geom_line(aes(color = Type), alpha = 0.7, size = 0.3) +
  scale_color_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC","#FFCC00", "#00CCFF")) +
  ylim(0,0.8) +
  theme_bw() +
  theme(axis.text.x=element_blank())

ggplot() +
  geom_bar(data = Dvir_TEdens_df_Chr3, mapping = aes(fill=Type, y=Prop, x=ID), position="stack", stat="identity") +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC","#FFCC00", "#00CCFF")) +
  ylim(0,1.1) +
  theme_bw() +
  theme(axis.text.x=element_blank())

Dvir_TEdens_df_Chr4 <- subset(Dvir_TEdens_df, Chr == "Chr_4" ) 

ggplot(Dvir_TEdens_df_Chr4, aes(x = ID, y = Prop)) +
  geom_line(aes(color = Type), alpha = 0.7, size = 0.3) +
  scale_color_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC","#FFCC00", "#00CCFF")) +
  ylim(0,0.6) +
  theme_bw() +
  theme(axis.text.x=element_blank())

ggplot() +
  geom_bar(data = Dvir_TEdens_df_Chr4, mapping = aes(fill=Type, y=Prop, x=ID), position="stack", stat="identity") +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC","#FFCC00", "#00CCFF")) +
  ylim(0,1.1) +
  theme_bw() +
  theme(axis.text.x=element_blank())

Dvir_TEdens_df_Chr5 <- subset(Dvir_TEdens_df, Chr == "Chr_5") 

ggplot(Dvir_TEdens_df_Chr5, aes(x = ID, y = Prop)) +
  geom_line(aes(color = Type), alpha = 0.7, size = 0.3) +
  scale_color_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC","#FFCC00", "#00CCFF")) +
  ylim(0,1) +
  theme_bw() +
  theme(axis.text.x=element_blank())

ggplot() +
  geom_bar(data = Dvir_TEdens_df_Chr5, mapping = aes(fill=Type, y=Prop, x=ID), position="stack", stat="identity") +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC","#FFCC00", "#00CCFF")) +
  ylim(0,1.1) +
  theme_bw() +
  theme(axis.text.x=element_blank())

Dvir_TEdens_df_Chr6 <- subset(Dvir_TEdens_df, Chr == "Chr_6") 

ggplot(Dvir_TEdens_df_Chr6, aes(x = ID, y = Prop)) +
  geom_line(aes(color = Type), alpha = 0.7, size = 0.3) +
  scale_color_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC","#FFCC00", "#00CCFF")) +
  theme_bw() +
  theme(axis.text.x=element_blank())

Dvir_TEdens_df_ChrX <- subset(Dvir_TEdens_df, Chr == "Chr_X") 

ggplot(Dvir_TEdens_df_ChrX, aes(x = ID, y = Prop)) +
  geom_line(aes(color = Type), alpha = 0.7, size = 0.3) +
  scale_color_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC","#FFCC00", "#00CCFF")) +
  ylim(0,1) +
  theme_bw() +
  theme(axis.text.x=element_blank())

ggplot() +
  geom_bar(data = Dvir_TEdens_df_ChrX, mapping = aes(fill=Type, y=Prop, x=ID), position="stack", stat="identity") +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC","#FFCC00", "#00CCFF")) +
  ylim(0,1.1) +
  theme_bw() +
  theme(axis.text.x=element_blank())

Make histogram of amount of DAIBAM in randomly selected regions in Dnov, of the same size as the breakpoints.

dnov_shuffle_daibam <- read.delim(file='~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/DAIBAM/Dnov_shuffle_DAIBAM.amounts', header=F, sep="\t", as.is = T)

head(dnov_shuffle_daibam)
##   V1
## 1  0
## 2  0
## 3  0
## 4  0
## 5  0
## 6  0
dnov_shuffle_daibam <- as.vector(dnov_shuffle_daibam$V1)


hist(dnov_shuffle_daibam)

quantile(dnov_shuffle_daibam, probs = c(0.05, 0.95))
##  5% 95% 
##   0 120
quantile(dnov_shuffle_daibam, probs = c( 0.95, 0.99, 0.999))
##      95%      99%    99.9% 
##  120.000 2065.000 4030.991

Make histograms of ITR array length in the three species

dvir_ITRs <- read.delim("~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/ITRs/Dvir.ITRs.merged.filtered.bed", header=F, sep="\t", as.is = T)
head(dvir_ITRs)
##      V1     V2     V3  V4
## 1 Chr_2 293710 294023 313
## 2 Chr_2 294464 295271 807
## 3 Chr_2 296106 296445 339
## 4 Chr_2 301009 301359 350
## 5 Chr_2 308059 308420 361
## 6 Chr_2 309077 309433 356
dvir_tohist <- dvir_ITRs$V4[which(dvir_ITRs$V4<20000)]

hist(dvir_tohist)

dnov_ITRs <- read.delim("~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/ITRs/Dnov.ITRs.merged.filtered.bed", header=F, sep="\t", as.is = T)
head(dnov_ITRs)
##      V1     V2     V3    V4
## 1 Chr_2  92083  92257   174
## 2 Chr_2 117164 117287   123
## 3 Chr_2 204357 204506   149
## 4 Chr_2 248564 279145 30581
## 5 Chr_2 368809 368925   116
## 6 Chr_2 369624 369968   344
dnov_tohist <- dnov_ITRs$V4[which(dnov_ITRs$V4<20000)]
hist(dnov_tohist)

dame_ITRs <- read.delim("~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/ITRs/Dame.ITRs.merged.filtered.bed", header=F, sep="\t", as.is = T)
head(dame_ITRs)
##      V1     V2     V3     V4
## 1 Chr_2 221966 223121   1155
## 2 Chr_2 279513 279633    120
## 3 Chr_2 291842 300987   9145
## 4 Chr_2 372159 372306    147
## 5 Chr_2 375389 375494    105
## 6 Chr_2 415783 526485 110702
dam_tohist <- dame_ITRs$V4[which(dame_ITRs$V4<20000)]
#hist(dam_tohist, breaks=c(100,200,500,1000,3000,5000,10000,20000))
hist(dam_tohist)

# next make a histogram with ggplot, limiting the x axis so we don't see these outliers

D. americana Nanopore data

First, ML97.5

ML975_masking <- read.csv('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/Dame_nanopore/ML975_masking.csv', header=T)
ML975_masking
##             Type nt_masked  total_nt  perc_mask
## 1           LINE  11946512 236071707  5.0605437
## 2            DNA   1234587 236071707  0.5229712
## 3            LTR  11974885 236071707  5.0725625
## 4 Helitron/DINEs  40777050 236071707 17.2731627
## 5          Other  10081978 236071707  4.2707269
## 6        Unknown   5143846 236071707  2.1789337
## 7         Non-TE 154912849 236071707 65.6210992
ML975_masking$Type <- factor(ML975_masking$Type, levels=c("DNA", "Helitron/DINEs", "LTR", "LINE", "Other", "Unknown", "Non-TE"))

ggplot (ML975_masking, aes(x = "", y = perc_mask, fill = Type)) +
  geom_bar(width=1, stat="identity") +
  coord_polar("y", start=0) +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF", "#CCCCCC")) +
  theme_void()

## now the landscape plot ###

ML975_landscape <- read.delim('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/Dame_nanopore/DameNanopore_ML975_canu.ME.fa.out.landscape.Div.Rclass.tab.sed.txt', header=F, sep="\t")
types <- ML975_landscape$V1

ML975_landscape$V1 <- NULL
rownames(ML975_landscape) <- types
RC_DINE <- ML975_landscape["DINE",] + ML975_landscape["RC",]
to_remove <- c("DINE", "RC")
ML975_landscape_edit <- ML975_landscape[!row.names(ML975_landscape)%in%to_remove,]
ML975_landscape_edit2 <- rbind(ML975_landscape_edit, RC_DINE)

# ok, got it.

Types <- c()
windows <- c()
Proportion <- c()

for (i in 1:nrow(ML975_landscape_edit2)) {
  Types <- c(Types, rep(rownames(ML975_landscape_edit2)[i], times=40)) 
  windows <- c(windows, c(1:40))
  Proportion <- c(Proportion, as.numeric(ML975_landscape_edit2[i,1:ncol(ML975_landscape_edit2)]))
}

ML975_landscape_df <- data.frame(Type=Types, Window=windows, Prop=Proportion)

# still need to divide by the appropriate number
ML975_size <- 236071707
ML975_landscape_df$Prop <- ML975_landscape_df$Prop/ML975_size

# remove the SINE
ML975_landscape_df_mod <- ML975_landscape_df[which(ML975_landscape_df$Type!="SINE"),]

ML975_landscape_df_mod$Type <- factor(ML975_landscape_df_mod$Type, levels=c("DNA", "DINE", "LTR", "LINE", "Other", "Unknown"))

# stacked barplot
ggplot(ML975_landscape_df_mod, aes(fill=Type, y=Prop, x=Window)) +
  geom_bar(position="stack", stat="identity") +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF")) +
  #ylim(0,0.025) +
  theme_bw()

SB0206

SB0206_masking <- read.csv('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/Dame_nanopore/SB0206_masking.csv', header=T)
SB0206_masking
##             Type nt_masked  total_nt  perc_mask
## 1           LINE  12330108 240772203  5.1210679
## 2            DNA   1340327 240772203  0.5566785
## 3            LTR  13188648 240772203  5.4776456
## 4 Helitron/DINEs  35510110 240772203 14.7484259
## 5          Other  10481989 240772203  4.3534880
## 6        Unknown   5189270 240772203  2.1552613
## 7         Non-TE 162731751 240772203 67.5874328
SB0206_masking$Type <- factor(SB0206_masking$Type, levels=c("DNA", "Helitron/DINEs", "LTR", "LINE", "Other", "Unknown", "Non-TE"))

ggplot (SB0206_masking, aes(x = "", y = perc_mask, fill = Type)) +
  geom_bar(width=1, stat="identity") +
  coord_polar("y", start=0) +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF", "#CCCCCC")) +
  theme_void()

## now the landscape plot ###

SB0206_landscape <- read.delim('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/Dame_nanopore/DameNanopore_SB0206_canu.ME.fa.out.landscape.Div.Rclass.tab.sed.txt', header=F, sep="\t")
types <- SB0206_landscape$V1

SB0206_landscape$V1 <- NULL
rownames(SB0206_landscape) <- types
RC_DINE <- SB0206_landscape["DINE",] + SB0206_landscape["RC",]
to_remove <- c("DINE", "RC")
SB0206_landscape_edit <- SB0206_landscape[!row.names(SB0206_landscape)%in%to_remove,]
SB0206_landscape_edit2 <- rbind(SB0206_landscape_edit, RC_DINE)

# ok, got it.

Types <- c()
windows <- c()
Proportion <- c()

for (i in 1:nrow(SB0206_landscape_edit2)) {
  Types <- c(Types, rep(rownames(SB0206_landscape_edit2)[i], times=40)) 
  windows <- c(windows, c(1:40))
  Proportion <- c(Proportion, as.numeric(SB0206_landscape_edit2[i,1:ncol(SB0206_landscape_edit2)]))
}

SB0206_landscape_df <- data.frame(Type=Types, Window=windows, Prop=Proportion)

# still need to divide by the appropriate number
SB0206_size <- 240772203
SB0206_landscape_df$Prop <- SB0206_landscape_df$Prop/SB0206_size

# remove the SINE
SB0206_landscape_df_mod <- SB0206_landscape_df[which(SB0206_landscape_df$Type!="SINE"),]

SB0206_landscape_df_mod$Type <- factor(SB0206_landscape_df_mod$Type, levels=c("DNA", "DINE", "LTR", "LINE", "Other", "Unknown"))

# stacked barplot
ggplot(SB0206_landscape_df_mod, aes(fill=Type, y=Prop, x=Window)) +
  geom_bar(position="stack", stat="identity") +
  scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF")) +
  #ylim(0,0.025) +
  theme_bw()